;************************************************************************
;*
;* Copyright:
;*	Freescale Semiconductor, INC. All Rights Reserved.  
;*  You are hereby granted a copyright license to use, modify, and
;*  distribute the SOFTWARE so long as this entire notice is
;*  retained without alteration in any modified and/or redistributed
;*  versions, and that such modified versions are clearly identified
;*  as such. No licenses are granted by implication, estoppel or
;*  otherwise under any patents or trademarks of Freescale Semiconductor, 
;*  Inc. This software is provided on an "AS IS" basis and without warranty.
;*
;*  To the maximum extent permitted by applicable law, FREESCALE 
;*  DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING 
;*  IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
;*  PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH REGARD TO THE 
;*  SOFTWARE (INCLUDING ANY MODIFIED VERSIONS THEREOF) AND ANY 
;*  ACCOMPANYING WRITTEN MATERIALS.
;* 
;*  To the maximum extent permitted by applicable law, IN NO EVENT
;*  SHALL FREESCALE BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING 
;*  WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS 
;*  INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
;*  LOSS) ARISING OF THE USE OR INABILITY TO USE THE SOFTWARE.   
;* 
;*  Freescale assumes no responsibility for the maintenance and support
;*  of this software
;*********************************************
;* FILENAME: iir32.s
;*
;* PURPOSE: IIR32 module source file, containing the function that computes
;*	        an Infinite Impulse Response (IIR) filter on the eMAC unit,
;*	        using the caller-provided filter data structure.
;*
;* AUTHOR: original code was written by Andrey Butok,
;*		   optimized for eMAC unit by Dmitriy Karpenko	        
;*********************************************

 .section .text       ;-=Locate the code in the ".text" section.=-
;#define __EMAC_H
;#include "emac.h"

 .ALIGN 4
 .XDEF _IIR32_EMAC
 .extern _malloc
 .extern _free
 
 
 
;******************************************************
;* NAME: void IIR32( struct tIirStruct *pIIR, Frac32* pX, Frac32* pY, unsigned int n)
;*       (exported under the symbol _IIR32_EMAC)
;*
;* DESCRIPTION: Computes an Infinite Impulse Response (IIR) filter for an
;*              array of 32-bit fractional data values using the eMAC unit.
;*              Output samples are produced four at a time in ACC0..ACC3,
;*              with scalar "tail" loops for the remaining samples.
;*              At the end the last N-1 (output,input) pairs are copied to the
;*              history buffer and pIIR->iIirHistoryCount is updated.
;*
;* STACK FRAME (valid between prologue and epilogue; 64 bytes are reserved):
;*    0(a7)..59(a7)  saved d0-d7/a0-a6
;*   60(a7)          saved MACSR (kept INSIDE the frame so that nothing pushed
;*                   on the stack while this routine runs can overwrite it)
;*   64(a7)          return address
;*   68(a7)  pIIR  - Pointer to a data structure containing private data for the iir filter
;*   72(a7)  pX    - Pointer to the input vector of n data elements
;*   76(a7)  pY    - Pointer to the output vector of n data elements
;*   80(a7)  n     - Length of the input and output vectors
;*
;* FIELDS OF *pIIR AS ACCESSED HERE:
;*    0(a2) pIirCoef, 4(a2) iIirCoefCount, 8(a2) pIirHistory, 12(a2) iIirHistoryCount
;*
;* All data/address registers and MACSR are restored on exit.
;* ACC0..ACC3 are used as work accumulators and are not preserved.
;******************************************************
;* a2          pIIR        - Pointer to a data structure containing private data for the iir filter
;* d2          k           - Counter for inner loop
;* d1          i           - Counter for outer loop
;* d0          N           - Length of coefficients vector(N<=n)
;* a0          pCurY       - Pointer to the current Y
;* a1          pCurX       - Pointer to the current X
;* a3          pCurCoef    - Pointer to the current coefficient
;* a4          pCurHistory - Pointer to the current element of history buffer
;* a5          pPredY      - Pointer to the previous Y
;******************************************************
_IIR32_EMAC:
;---=Saving values of used registers=---
    lea       -64(a7),a7                    ;reserve 15 registers * 4 bytes + 4 bytes for MACSR
    movem.l   d0-d7/a0-a6,(a7)
;//Saving value of MAC status register inside the reserved frame
    move.l    MACSR,d0
    move.l    d0,60(a7)

;//defining the mode of MAC unit
;#ifdef __FRACT_M
    move.l    #0x00000030,MACSR
;#else
;move.l 		#0x00000000,MACSR
;#endif
;---=Most useful parameters are moved from stack to registers.=---
    move.l    76(a7),a0                     ;pCurY=pY;       -= Pointer to the current Y.=-
    move.l    72(a7),a1                     ;pCurX=pX;       -= Pointer to the current X.=-
    move.l    68(a7),a2                     ;N=pIIR->iIirCoefCount/2+1;
    move.l    4(a2),d0
    lsr.l     #1,d0
    addq.l    #1,d0
;---====== Begin of getting Y[1]..Y[N] ======---
    move.l    #0,ACC0                       ;-=accumulators initialization=-
    move.l    #0,ACC1
    move.l    #0,ACC2
    move.l    #0,ACC3
;//computing a block of output samples from Y[1] to Y[N-N%4]
    moveq.l   #4,d1                         ;for(i=4;i<=N;i+=4) { //Begin of outer loop #1
.FORi1:
    cmp.l     d0,d1                         ;//Comparing i with N
    bhi       .ENDFORi1                     ;//If (i>N) then jump to .ENDFORi1

    move.l    72(a7),a6                     ;pCurX=pX+i-4; //Current sample pointer initialization
    lea       (-16,a6,d1.l*4),a1

    move.l    (a2),a3                       ;pCurCoef=pIIR->pIirCoef; //Current coefficient for input pointer initialization

    movem.l   (a1),d3-d6                    ;d3=pCurX[0]; d4=pCurX[1]; d5=pCurX[2]; d6=pCurX[3]; (pCurX unchanged)
    move.l    (a3)+,a6                      ;a6=*pCurCoef++;

    mac.l     a6,d6,<<,-(a1),d6,ACC3        ;ACC3+=a6*d6; d6=*--pCurX; //getting next input sample
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d3,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d3; a6=*pCurCoef++;//getting next coefficient
;//cycle of multiplying 8 input samples on 4 coefficients per iteration
    move.l    #4,d2                         ;for(k=4; k<i; k+=4) { //Begin of inner loop #1

.FORk1:
    cmp.l     d1,d2                         ;//comparing k with i
    bcc       .ENDFORk1                     ;//if (k>=i) then jump to .ENDFORk1

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d5,<<,-(a1),d5,ACC3        ;ACC3+=a6*d5; d5=*--pCurX;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d6,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d6; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d4,<<,-(a1),d4,ACC3        ;ACC3+=a6*d4; d4=*--pCurX;
    mac.l     a6,d3,<<,ACC2                 ;ACC2+=a6*d3;
    mac.l     a6,d6,<<,ACC1                 ;ACC1+=a6*d6;
    mac.l     a6,d5,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d5; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d3,<<,-(a1),d3,ACC3        ;ACC3+=a6*d3; d3=*--pCurX;
    mac.l     a6,d6,<<,ACC2                 ;ACC2+=a6*d6;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d4,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d4; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d6,<<,-(a1),d6,ACC3        ;ACC3+=a6*d6; d6=*--pCurX;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d3,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d3; a6=*pCurCoef++;

    addq.l    #4,d2                         ;//incrementing k
    bra       .FORk1                        ;//jumping to .FORk1

.ENDFORk1:                                  ;} //end of inner loop #1
;//multiplying 3 first input samples on 3 coefficients
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d5,<<,(a3)+,a6,ACC3        ;ACC3+=a6*d5; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d3,<<,ACC2                 ;ACC2+=a6*d3;
    mac.l     a6,d4,<<,(a3)+,a6,ACC3        ;ACC3+=a6*d4; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d3,<<,ACC3                 ;ACC3+=a6*d3;

;//Testing that history buffer is not empty => this is not the first calling of this subroutine
    tst.l     12(a2)                        ;if (pIIR->iIirHistoryCount>0) { //if #1
    beq       .ENDBUFy                      ;//if (pIIR->iIirHistoryCount==0) then jump to .ENDBUFy

    move.l    8(a2),a6                      ;pCurX=pIIR->pIirHistory+i*2-7; //Current sample pointer initialization
    lsl.l     #1,d1                         ;//d1 is temporarily doubled for the address computation
    lea       (-28,a6,d1.l*4),a1
    lsr.l     #1,d1                         ;//restore i

    move.l    (a2),a6                       ;pCurCoef=pIIR->pIirCoef+N*2-2; //Current coefficient for input pointer initialization
    lsl.l     #1,d0                         ;//d0 is temporarily doubled for the address computation
    lea       (-8,a6,d0.l*4),a3
    lsr.l     #1,d0                         ;//restore N

    move.l    (a1)+,d3                      ;d3=*pCurX++;
    adda.l    #4,a1                         ;pCurX++; // skip the output sample from history buffer
    move.l    (a1)+,d4                      ;d4=*pCurX++;
    adda.l    #4,a1                         ;pCurX++; // skip the output sample from history buffer
    move.l    (a1)+,d5                      ;d5=*pCurX++;
    adda.l    #4,a1                         ;pCurX++; // skip the output sample from history buffer
    move.l    (a1)+,d6                      ;d6=*pCurX++;
    adda.l    #4,a1                         ;pCurX++; // skip the output sample from history buffer
    move.l    -(a3),a6                      ;a6=*--pCurCoef;
    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample

    cmp.l     d0,d1                         ;if (N==i) { //if #2
    bne       .CONT                         ;//if (N!=i) then jump to .CONT
;//multiplying 3 input samples from history buffer on 3 coefficients
    mac.l     a6,d3,<<,ACC0                 ;ACC0+=a6*d3;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,-(a3),a6,ACC2        ;ACC2+=a6*d5; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample

    mac.l     a6,d4,<<,ACC0                 ;ACC0+=a6*d4;
    mac.l     a6,d5,<<,-(a3),a6,ACC1        ;ACC1+=a6*d5; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample

    mac.l     a6,d5,<<,ACC0                 ;ACC0+=a6*d5;

    bra       .ENDBUFx                      ;//jump to .ENDBUFx
                                            ;} //end if #2
.CONT:                                      ;if (N!=i) { //if #3

    mac.l     a6,d3,<<,(a1)+,d3,ACC0        ;ACC0+=a6*d3; d3=*pCurX++;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d6,<<,-(a3),a6,ACC3        ;ACC3+=a6*d6; a6=*--pCurCoef;

    move.l    d1,d2                         ;//k=i+4;
    addq.l    #4,d2
;//cycle of multiplying 8 input samples from history buffer on 4 coefficients per iteration
.FORk11:                                    ;for(k=i+4; k<N; k+=4) { //begin of inner loop #2
    cmp.l     d0,d2                         ;//comparing k with N
    bcc       .ENDFORk11                    ;//if (k>=N) then jump to .ENDFORk11

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample
    adda.l    #4,a1                         ;pCurX++; //skip the output sample from history buffer

    mac.l     a6,d4,<<,(a1)+,d4,ACC0        ;ACC0+=a6*d4; d4=*pCurX++;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d6,<<,ACC2                 ;ACC2+=a6*d6;
    mac.l     a6,d3,<<,-(a3),a6,ACC3        ;ACC3+=a6*d3; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample
    adda.l    #4,a1                         ;pCurX++; //skip the output sample from history buffer

    mac.l     a6,d5,<<,(a1)+,d5,ACC0        ;ACC0+=a6*d5; d5=*pCurX++;
    mac.l     a6,d6,<<,ACC1                 ;ACC1+=a6*d6;
    mac.l     a6,d3,<<,ACC2                 ;ACC2+=a6*d3;
    mac.l     a6,d4,<<,-(a3),a6,ACC3        ;ACC3+=a6*d4; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample
    adda.l    #4,a1                         ;pCurX++; //skip the output sample from history buffer

    mac.l     a6,d6,<<,(a1)+,d6,ACC0        ;ACC0+=a6*d6; d6=*pCurX++;
    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d5,<<,-(a3),a6,ACC3        ;ACC3+=a6*d5; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample
    adda.l    #4,a1                         ;pCurX++; //skip the output sample from history buffer

    mac.l     a6,d3,<<,(a1)+,d3,ACC0        ;ACC0+=a6*d3; d3=*pCurX++;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d6,<<,-(a3),a6,ACC3        ;ACC3+=a6*d6; a6=*--pCurCoef;

    addq.l    #4,d2                         ;//incrementing k
    bra       .FORk11                       ;//jumping to .FORk11

.ENDFORk11:                                 ;} //end of inner loop #2
;//cycle of multiplying 4 input samples from history buffer on 1 coefficient per iteration
    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample
    adda.l    #4,a1                         ;pCurX++; //skip the output sample from history buffer

    move.l    d0,d2                         ;//d2=(N-1)%4;
    subq.l    #1,d2
    andi.l    #3,d2

.FORk12:                                    ;for(k=(N-1)%4; k>0; k--){//begin of inner loop #3
    cmpi.l    #0,d2                         ;//comparing k with 0
    beq       .ENDFORk12                    ;//if (k==0) then jump to .ENDFORk12

    mac.l     a6,d4,<<,ACC0                 ;ACC0+=a6*d4;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d6,<<,ACC2                 ;ACC2+=a6*d6;
    mac.l     a6,d3,<<,-(a3),a6,ACC3        ;ACC3+=a6*d3; a6=*--pCurCoef;

    move.l    d5,d4                         ;d4=d5;
    move.l    d6,d5                         ;d5=d6;
    move.l    d3,d6                         ;d6=d3;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample
    move.l    (a1)+,d3                      ;d3=*pCurX++;
    adda.l    #4,a1                         ;pCurX++; //skip the output sample from history buffer

    subq.l    #1,d2                         ;//decrementing k
    bra       .FORk12                       ;//jumping to .FORk12
.ENDFORk12:                                 ;} //end of inner loop #3
;//multiplying 3 input samples on 3 coefficients
    mac.l     a6,d4,<<,ACC0                 ;ACC0+=a6*d4;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d6,<<,-(a3),a6,ACC2        ;ACC2+=a6*d6; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample

    mac.l     a6,d5,<<,ACC0                 ;ACC0+=a6*d5;
    mac.l     a6,d6,<<,-(a3),a6,ACC1        ;ACC1+=a6*d6; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for output sample

    mac.l     a6,d6,<<,ACC0                 ;ACC0+=a6*d6;

.ENDBUFx:                                   ;}//end if #3

    move.l    (a2),a6                       ;pCurCoef=pIIR->pIirCoef+i*2-6;
    lsl.l     #1,d1
    lea       (-24,a6,d1.l*4),a3
    lsr.l     #1,d1

    move.l    8(a2),a6                      ;pPredY=pIIR->pIirHistory+N*2-3;
    lsl.l     #1,d0
    lea       (-12,a6,d0.l*4),a5
    lsr.l     #1,d0

    move.l    -(a5),a6                      ;a6=*--pPredY;
    suba.l    #4,a5                         ;pPredY--; //skip the input sample from history buffer
    move.l    (a3)+,d3                      ;d3=*pCurCoef++;
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample
    move.l    (a3)+,d4                      ;d4=*pCurCoef++;
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample
    move.l    (a3)+,d5                      ;d5=*pCurCoef++;
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample
    move.l    (a3)+,d6                      ;d6=*pCurCoef++;
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample

    move.l    d1,d2                         ;//d2=i+4;
    addq.l    #4,d2
;//cycle of multiplying 4 output samples from history buffer on 8 coefficients per iteration
.FORk13:                                    ;for(k=i+4; k<=N; k+=4){ //begin of inner loop #4
    cmp.l     d0,d2                         ;//comparing k with N
    bhi       .ENDFORk13                    ;//if (k>N) then jump to .ENDFORk13

    mac.l     a6,d3,<<,(a3)+,d3,ACC0        ;ACC0+=a6*d3; d3=*pCurCoef++;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d6,<<,-(a5),a6,ACC3        ;ACC3+=a6*d6; a6=*--pPredY;

    suba.l    #4,a5                         ;pPredY--; //skip the input sample from history buffer
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample

    mac.l     a6,d4,<<,(a3)+,d4,ACC0        ;ACC0+=a6*d4; d4=*pCurCoef++;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d6,<<,ACC2                 ;ACC2+=a6*d6;
    mac.l     a6,d3,<<,-(a5),a6,ACC3        ;ACC3+=a6*d3; a6=*--pPredY;

    suba.l    #4,a5                         ;pPredY--; //skip the input sample from history buffer
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample

    mac.l     a6,d5,<<,(a3)+,d5,ACC0        ;ACC0+=a6*d5; d5=*pCurCoef++;
    mac.l     a6,d6,<<,ACC1                 ;ACC1+=a6*d6;
    mac.l     a6,d3,<<,ACC2                 ;ACC2+=a6*d3;
    mac.l     a6,d4,<<,-(a5),a6,ACC3        ;ACC3+=a6*d4; a6=*--pPredY;

    suba.l    #4,a5                         ;pPredY--; //skip the input sample from history buffer
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample

    mac.l     a6,d6,<<,(a3)+,d6,ACC0        ;ACC0+=a6*d6; d6=*pCurCoef++;
    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d5,<<,-(a5),a6,ACC3        ;ACC3+=a6*d5; a6=*--pPredY;

    suba.l    #4,a5                         ;pPredY--; //skip the input sample from history buffer
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample

    addq.l    #4,d2                         ;//k+=4;
    bra       .FORk13                       ;//jumping to .FORk13
.ENDFORk13:                                 ;}//end of inner loop #4

    move.l    d0,d2                         ;//d2=N%4;
    andi.l    #3,d2
;//cycle of multiplying of 1 output sample from history buffer on 4 coefficients per iteration
.FORk14:                                    ;for(k=N%4; k>0; k--){ //begin of inner loop #5
    cmpi.l    #0,d2                         ;//comparing k with 0
    beq       .ENDFORk14                    ;//if (k==0) then jump to .ENDFORk14

    mac.l     a6,d3,<<,ACC0                 ;ACC0+=a6*d3;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d6,<<,-(a5),a6,ACC3        ;ACC3+=a6*d6; a6=*--pPredY;

    move.l    d4,d3                         ;d3=d4;
    move.l    d5,d4                         ;d4=d5;
    move.l    d6,d5                         ;d5=d6;

    move.l    (a3)+,d6                      ;d6=*pCurCoef++;
    suba.l    #4,a5                         ;pPredY--; //skip the input sample from history buffer
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample

    subq.l    #1,d2                         ;//decrementing k
    bra       .FORk14                       ;//jumping to .FORk14
.ENDFORk14:                                 ;}//end of inner loop #5
;//multiplying 3 output samples from history buffer on 3 coefficients
    mac.l     a6,d3,<<,ACC0                 ;ACC0+=a6*d3;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,-(a5),a6,ACC2        ;ACC2+=a6*d5; a6=*--pPredY;

    suba.l    #4,a5                         ;pPredY--; //skip the input sample from history buffer

    mac.l     a6,d4,<<,ACC0                 ;ACC0+=a6*d4;
    mac.l     a6,d5,<<,-(a5),a6,ACC1        ;ACC1+=a6*d5; a6=*--pPredY;

    mac.l     a6,d5,<<,ACC0                 ;ACC0+=a6*d5;

.ENDBUFy:                                   ;}//end if #1

    move.l    (a2),a6                       ;pCurCoef=pIIR->pIirCoef+i*2-1;
    lsl.l     #1,d1
    lea       (-4,a6,d1.l*4),a3
    lsr.l     #1,d1

    move.l    76(a7),a5                     ;pPredY=pY;

    move.l    (a5)+,a6                      ;a6=*pPredY++;
    move.l    -(a3),d6                      ;d6=*--pCurCoef;
    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample
    move.l    -(a3),d5                      ;d5=*--pCurCoef;
    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample
    move.l    -(a3),d4                      ;d4=*--pCurCoef;
    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample
    move.l    -(a3),d3                      ;d3=*--pCurCoef;

    move.l    #4,d2                         ;//k=4;
;//cycle of multiplying 4 output samples on 8 coefficients per iteration
.FORk2:                                     ;for(k=4; k<i; k+=4){ //begin of inner loop #6
    cmp.l     d1,d2                         ;//comparing k with i
    bcc       .ENDFORk2                     ;//if (k>=i) then jump to .ENDFORk2

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d6,<<,-(a3),d6,ACC3        ;ACC3+=a6*d6; d6=*--pCurCoef;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d3,<<,(a5)+,a6,ACC0        ;ACC0+=a6*d3; a6=*pPredY++;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d5,<<,-(a3),d5,ACC3        ;ACC3+=a6*d5; d5=*--pCurCoef;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d6,<<,(a5)+,a6,ACC0        ;ACC0+=a6*d6; a6=*pPredY++;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d4,<<,-(a3),d4,ACC3        ;ACC3+=a6*d4; d4=*--pCurCoef;
    mac.l     a6,d3,<<,ACC2                 ;ACC2+=a6*d3;
    mac.l     a6,d6,<<,ACC1                 ;ACC1+=a6*d6;
    mac.l     a6,d5,<<,(a5)+,a6,ACC0        ;ACC0+=a6*d5; a6=*pPredY++;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d3,<<,-(a3),d3,ACC3        ;ACC3+=a6*d3; d3=*--pCurCoef;
    mac.l     a6,d6,<<,ACC2                 ;ACC2+=a6*d6;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d4,<<,(a5)+,a6,ACC0        ;ACC0+=a6*d4; a6=*pPredY++;

    addq.l    #4,d2                         ;//k+=4
    bra       .FORk2                        ;//jumping to .FORk2

.ENDFORk2:                                  ;}//end of inner loop #6
;//multiplying currently computing output samples on 3 first coefficients and moving results to
;//array of output samples
    move.l    (a2),a3                       ;pCurCoef=pIIR->pIirCoef+2;
    adda.l    #8,a3
    move.l    (a3)+,d3                      ;d3=*pCurCoef++;
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample
    move.l    (a3)+,d4                      ;d4=*pCurCoef++;
    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for input sample
    move.l    (a3)+,d5                      ;d5=*pCurCoef++;

    movclr.l  ACC0,d6                       ;d6=ACC0; ACC0=0;
    move.l    d6,(a0)+                      ;(*pCurY++)=d6;
    mac.l     d3,d6,<<,ACC1                 ;ACC1+=d3*d6;
    movclr.l  ACC1,a6                       ;a6=ACC1; ACC1=0;
    move.l    a6,(a0)+                      ;(*pCurY++)=a6;
    mac.l     d3,a6,<<,ACC2                 ;ACC2+=d3*a6;
    mac.l     d4,d6,<<,ACC2                 ;ACC2+=d4*d6;
    movclr.l  ACC2,d7                       ;d7=ACC2; ACC2=0;
    move.l    d7,(a0)+                      ;(*pCurY++)=d7;
    mac.l     d3,d7,<<,ACC3                 ;ACC3+=d3*d7;
    mac.l     d4,a6,<<,ACC3                 ;ACC3+=d4*a6;
    mac.l     d5,d6,<<,ACC3                 ;ACC3+=d5*d6;
    movclr.l  ACC3,d6                       ;d6=ACC3; ACC3=0;
    move.l    d6,(a0)+                      ;(*pCurY++)=d6;

    addq.l    #4,d1                         ;//i+=4;
    bra       .FORi1                        ;//jumping to .FORi1

.ENDFORi1:                                  ;}//end of outer loop #1

    move.l    d0,d7                         ;d7=N%4;
    andi.l    #3,d7
    move.l    d0,d5                         ;d5=N-d7+1;
    sub.l     d7,d5
    addq.l    #1,d5

    moveq.l   #1,d1                         ;//i=1;
;//cycle of computing a "tail" of output samples from Y[N-N%4+1] to Y[N]
.FORi2:                                     ;for(i=1; i<=N%4; i++){ //begin of outer loop #2
    cmp.l     d7,d1                         ;//comparing i with N%4
    bhi       .ENDFORi2                     ;//if (i>N%4) then jump to .ENDFORi2

    move.l    72(a7),a6                     ;pCurX=pX+d5;
    lea       (0,a6,d5.l*4),a1
    move.l    76(a7),a6                     ;pPredY=pY+d5-1;
    lea       (-4,a6,d5.l*4),a5

    move.l    (a2),a3                       ;pCurCoef=pIIR->pIirCoef;

    move.l    -(a1),d3                      ;d3=*--pCurX;
    move.l    (a3)+,d4                      ;d4=*pCurCoef++;
    mac.l     d3,d4,<<,ACC0                 ;ACC0+=d3*d4;

    moveq     #1,d2                         ;//k=1;
    move.l    (a3)+,d4                      ;d4=*pCurCoef++;
.FORk3:                                     ;for(k=1; k<d5; k++){ //begin of inner loop #7
    cmp.l     d5,d2                         ;//comparing k with d5
    bcc       .ENDFORk3                     ;//if (k>=d5) then jump to .ENDFORk3

    move.l    -(a1),d3                      ;d3=*--pCurX;
    mac.l     d3,d4,<<,(a3)+,d4,ACC0        ;ACC0+=d3*d4; d4=*pCurCoef++;

    move.l    -(a5),d3                      ;d3=*--pPredY;
    mac.l     d3,d4,<<,(a3)+,d4,ACC0        ;ACC0+=d3*d4; d4=*pCurCoef++;

    addq.l    #1,d2                         ;//Incrementing k
    bra       .FORk3                        ;//Jumping to .FORk3
.ENDFORk3:                                  ;}//end of inner loop #7

    cmp.l     d1,d7                         ;//if (i==N%4) then jump to .ENDFORk31
    beq       .ENDFORk31                    ;if (i!=N%4) { //if #4
;//Testing that History Buffer is filled => this is not first calling of this function
    tst.l     12(a2)                        ;if(pIIR->iIirHistoryCount>0) { //if #5
    beq       .ENDFORk31

    move.l    8(a2),a6                      ;pCurHistory=pIIR->pIirHistory+pIIR->iIirHistoryCount;
    move.l    12(a2),d6
    lea       (0,a6,d6.l*4),a4

    move.l    d5,d2
.FORk31:                                    ;for(k=d5; k<N; k++) { //begin of inner loop #8
    cmp.l     d0,d2                         ;//Comparing k with N
    bcc       .ENDFORk31                    ;//If (k>=N) then jump to .ENDFORk31

    move.l    -(a4),d3                      ;d3=*--pCurHistory;
    mac.l     d3,d4,<<,(a3)+,d4,ACC0        ;ACC0+=d3*d4; d4=*pCurCoef++;

    move.l    -(a4),d3                      ;d3=*--pCurHistory;
    mac.l     d3,d4,<<,(a3)+,d4,ACC0        ;ACC0+=d3*d4; d4=*pCurCoef++;
    addq.l    #1,d2                         ;//Incrementing k
    bra       .FORk31                       ;//jumping to .FORk31
.ENDFORk31:                                 ;}//end of inner loop #8
                                            ;}//end if #5
                                            ;}//end if #4
    movclr.l  ACC0,d6                       ;d6=ACC0; ACC0=0;
    move.l    d6,(a0)+                      ;(*pCurY++)=d6;

    addq.l    #1,d1                         ;//incrementing i
    addq.l    #1,d5                         ;//incrementing d5
    bra       .FORi2                        ;//Jumping to .FORi2

.ENDFORi2:                                  ;//end of outer loop #2
;//End of getting Y[1]..Y[N]

;//begin of getting Y[N+1]..Y[n]
    move.l    80(a7),d7                     ;d7=n;//length of input and output arrays

    cmpi.l    #4,d0                         ;//if (N>=4) then jump to .NMORE3
    bcc       .NMORE3                       ;if (N<4) { //if #6

    move.l    d0,d1                         ;i=N+1;
    addq.l    #1,d1
    bra       .FORi4                        ;//jumping to .FORi4

.NMORE3:                                    ;}//end if #6
;//taken when N>=4
    move.l    d0,d1                         ;if (N>=4) { //if #7
    addq.l    #4,d1                         ;//i=N+4

.FORi3:                                     ;for (i=N+4; i<=n; i+=4){ //begin of outer loop #3
    cmp.l     d7,d1                         ;//comparing i with n
    bhi       .ENDFORi3                     ;//if (i>n) then jump to .ENDFORi3

    move.l    (a2),a3                       ;pCurCoef=pIIR->pIirCoef;
    move.l    72(a7),a6                     ;pCurX=pX+i-4;
    lea       (-16,a6,d1.l*4),a1

    movem.l   (a1),d3-d6                    ;d3=pCurX[0]; d4=pCurX[1]; d5=pCurX[2]; d6=pCurX[3]; (pCurX unchanged)
    move.l    (a3)+,a6                      ;a6=*pCurCoef++;

    move.l    d0,d2                         ;d2=N%4+4;
    andi.l    #3,d2
    addq.l    #4,d2

    cmpi.l    #4,d2                         ;if (d2==4){ //if #8
    bne       .CYCLE_BEGIN2
    addq.l    #1,d2                         ;d2+=1;
                                            ;}//end if #8
.CYCLE_BEGIN2:
;//multiplying 4 input samples on first coefficient
    mac.l     a6,d6,<<,-(a1),d6,ACC3        ;ACC3+=a6*d6; d6=*--pCurX;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d3,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d3; a6=*pCurCoef++;
;//cycle of multiplying 8 input samples on 4 coefficients per iteration
.FORk4:                                     ;for(k=d2; k<=N; k+=4) { //begin of inner loop #9
    cmp.l     d0,d2                         ;//comparing k with N
    bhi       .ENDFORk4                     ;//if (k>N) then jump to .ENDFORk4

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d5,<<,-(a1),d5,ACC3        ;ACC3+=a6*d5; d5=*--pCurX;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d6,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d6; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d4,<<,-(a1),d4,ACC3        ;ACC3+=a6*d4; d4=*--pCurX;
    mac.l     a6,d3,<<,ACC2                 ;ACC2+=a6*d3;
    mac.l     a6,d6,<<,ACC1                 ;ACC1+=a6*d6;
    mac.l     a6,d5,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d5; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d3,<<,-(a1),d3,ACC3        ;ACC3+=a6*d3; d3=*--pCurX;
    mac.l     a6,d6,<<,ACC2                 ;ACC2+=a6*d6;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d4,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d4; a6=*pCurCoef++;

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d6,<<,-(a1),d6,ACC3        ;ACC3+=a6*d6; d6=*--pCurX;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d3,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d3; a6=*pCurCoef++;

    addq.l    #4,d2                         ;//k+=4;
    bra       .FORk4                        ;//jumping to .FORk4

.ENDFORk4:                                  ;}//end of inner loop #9

    move.l    d0,d2                         ;//k=(N-1)%4
    subq.l    #1,d2
    andi.l    #3,d2
;//cycle of multiplying 4 input samples on 1 coefficient per iteration
.FORk5:                                     ;for(k=(N-1)%4; k>0; k--) { //begin of inner loop #10
    cmpi.l    #0,d2                         ;//comparing k with 0
    beq       .ENDFORk5                     ;//if (k==0) then jump to .ENDFORk5

    adda.l    #4,a3                         ;pCurCoef++; //skip the coefficient for output sample

    mac.l     a6,d5,<<,ACC3                 ;ACC3+=a6*d5;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d6,<<,(a3)+,a6,ACC0        ;ACC0+=a6*d6; a6=*pCurCoef++;

    move.l    d4,d5                         ;d5=d4;
    move.l    d3,d4                         ;d4=d3;
    move.l    d6,d3                         ;d3=d6;

    move.l    -(a1),d6                      ;d6=*--pCurX;
    subq.l    #1,d2                         ;//decrementing k
    bra       .FORk5                        ;//jumping to .FORk5

.ENDFORk5:                                  ;}//end of inner loop #10

    move.l    76(a7),a6                     ;pPredY=pY+i-N-3;
    move.l    d1,d2
    sub.l     d0,d2
    lea       (-12,a6,d2.l*4),a5

    move.l    (a2),a6                       ;pCurCoef=pIIR->pIirCoef+N*2-1;
    lsl.l     #1,d0
    lea       (-4,a6,d0.l*4),a3
    lsr.l     #1,d0

    movem.l   (a5),d3-d6                    ;d3=pPredY[0]; d4=pPredY[1]; d5=pPredY[2]; d6=pPredY[3];
    adda.l    #16,a5                        ;pPredY+=4;
    move.l    -(a3),a6                      ;a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    move.l    #8,d2
;//cycle of multiplying 8 output samples on 4 coefficients per iteration
.FORk6:                                     ;for(k=8; k<=N; k+=4) { //begin of inner loop #11
    cmp.l     d0,d2                         ;//comparing k with N
    bhi       .ENDFORk6                     ;//if (k>N) then jump to .ENDFORk6

    mac.l     a6,d3,<<,(a5)+,d3,ACC0        ;ACC0+=a6*d3; d3=*pPredY++;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d6,<<,-(a3),a6,ACC3        ;ACC3+=a6*d6; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d4,<<,(a5)+,d4,ACC0        ;ACC0+=a6*d4; d4=*pPredY++;
    mac.l     a6,d5,<<,ACC1                 ;ACC1+=a6*d5;
    mac.l     a6,d6,<<,ACC2                 ;ACC2+=a6*d6;
    mac.l     a6,d3,<<,-(a3),a6,ACC3        ;ACC3+=a6*d3; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d5,<<,(a5)+,d5,ACC0        ;ACC0+=a6*d5; d5=*pPredY++;
    mac.l     a6,d6,<<,ACC1                 ;ACC1+=a6*d6;
    mac.l     a6,d3,<<,ACC2                 ;ACC2+=a6*d3;
    mac.l     a6,d4,<<,-(a3),a6,ACC3        ;ACC3+=a6*d4; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d6,<<,(a5)+,d6,ACC0        ;ACC0+=a6*d6; d6=*pPredY++;
    mac.l     a6,d3,<<,ACC1                 ;ACC1+=a6*d3;
    mac.l     a6,d4,<<,ACC2                 ;ACC2+=a6*d4;
    mac.l     a6,d5,<<,-(a3),a6,ACC3        ;ACC3+=a6*d5; a6=*--pCurCoef;

    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    addq.l    #4,d2                         ;//k+=4;
    bra       .FORk6                        ;//jumping to .FORk6
.ENDFORk6:                                  ;}//end of inner loop #11

    move.l    d0,d2                         ;d2=(N-4)%4;
    subq.l    #4,d2
    andi.l    #3,d2
;//cycle of multiplying 4 output samples on 1 coefficient per iteration
.FORk7:                                     ;for(k=d2; k>0; k--) { //begin of inner loop #12
    cmpi.l    #0,d2                         ;//comparing k with 0
    beq       .ENDFORk7                     ;//if (k==0) then jump to .ENDFORk7

    mac.l     a6,d3,<<,ACC0                 ;ACC0+=a6*d3;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    mac.l     a6,d6,<<,-(a3),a6,ACC3        ;ACC3+=a6*d6; a6=*--pCurCoef;

    move.l    d4,d3                         ;d3=d4;
    move.l    d5,d4                         ;d4=d5;
    move.l    d6,d5                         ;d5=d6;

    move.l    (a5)+,d6                      ;d6=*pPredY++;
    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    subq.l    #1,d2                         ;//decrementing k
    bra       .FORk7                        ;//jumping to .FORk7
.ENDFORk7:                                  ;} //end of inner loop #12
;//preparing final multiplications
    move.l    a6,d6                         ;d6=a6; //keep this coefficient for the ACC3 update below

    mac.l     a6,d3,<<,ACC0                 ;ACC0+=a6*d3;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    mac.l     a6,d5,<<,-(a3),a6,ACC2        ;ACC2+=a6*d5; a6=*--pCurCoef;

    move.l    a6,d3                         ;d3=a6; //keep this coefficient for the ACC2/ACC3 updates below
    suba.l    #4,a3                         ;pCurCoef--; //skip the coefficient for input sample

    mac.l     a6,d4,<<,ACC0                 ;ACC0+=a6*d4;
    mac.l     a6,d5,<<,-(a3),a6,ACC1        ;ACC1+=a6*d5; a6=*--pCurCoef;

    mac.l     a6,d5,<<,ACC0                 ;ACC0+=a6*d5;
;//multiplying currently computing samples on coefficients and storing results
    movclr.l  ACC0,d4                       ;d4=ACC0; ACC0=0;
    move.l    d4,(a0)+                      ;(*pCurY++)=d4;
    mac.l     a6,d4,<<,ACC1                 ;ACC1+=a6*d4;
    movclr.l  ACC1,d5                       ;d5=ACC1; ACC1=0;
    move.l    d5,(a0)+                      ;(*pCurY++)=d5;
    mac.l     d3,d4,<<,ACC2                 ;ACC2+=d3*d4;
    mac.l     a6,d5,<<,ACC2                 ;ACC2+=a6*d5;
    movclr.l  ACC2,d2                       ;d2=ACC2; ACC2=0;
    move.l    d2,(a0)+                      ;(*pCurY++)=d2;
    mac.l     d6,d4,<<,ACC3                 ;ACC3+=d6*d4;
    mac.l     d3,d5,<<,ACC3                 ;ACC3+=d3*d5;
    mac.l     a6,d2,<<,ACC3                 ;ACC3+=a6*d2;
    movclr.l  ACC3,d6                       ;d6=ACC3; ACC3=0;
    move.l    d6,(a0)+                      ;(*pCurY++)=d6;

    addq.l    #4,d1                         ;//i+=4
    bra       .FORi3                        ;//jumping to .FORi3
.ENDFORi3:                                  ;}//end of outer loop #3

    move.l    d7,d5                         ;d5=(n-N)%4;
    sub.l     d0,d5
    andi.l    #3,d5
    move.l    d7,d1                         ;d1=n-d5+1;
    sub.l     d5,d1
    addq.l    #1,d1
                                            ;}//end if #7
;//computing a "tail" of output samples
.FORi4:                                     ;for(i=d1; i<=n; i++) { //begin of outer loop #4
    cmp.l     d7,d1                         ;//comparing i with n
    bhi       .ENDFORi4                     ;//if (i>n) then jump to .ENDFORi4

    move.l    72(a7),a6                     ;pCurX=pX+i;
    lea       (0,a6,d1.l*4),a1
    move.l    76(a7),a6                     ;pPredY=pY+i-1;
    lea       (-4,a6,d1.l*4),a5
    move.l    (a2),a3                       ;pCurCoef=pIIR->pIirCoef;

    move.l    -(a1),d3                      ;d3=*--pCurX;
    move.l    (a3)+,d4                      ;d4=*pCurCoef++;
    mac.l     d3,d4,<<,ACC0                 ;ACC0+=d3*d4;

    moveq     #1,d2                         ;//k=1
    move.l    (a3)+,d4                      ;d4=*pCurCoef++;

.FORk8:                                     ;for(k=1; k<N; k++) { //begin of inner loop #13
    cmp.l     d0,d2                         ;//comparing k with N
    bcc       .ENDFORk8                     ;//if (k>=N) then jump to .ENDFORk8

    move.l    -(a1),d3                      ;d3=*--pCurX;
    mac.l     d3,d4,<<,(a3)+,d4,ACC0        ;ACC0+=d3*d4; d4=*pCurCoef++;

    move.l    -(a5),d3                      ;d3=*--pPredY;
    mac.l     d3,d4,<<,(a3)+,d4,ACC0        ;ACC0+=d3*d4; d4=*pCurCoef++;

    addq.l    #1,d2                         ;//Incrementing k
    bra       .FORk8                        ;//jumping to .FORk8
.ENDFORk8:                                  ;}//end of inner loop #13

    movclr.l  ACC0,d6                       ;d6=ACC0; ACC0=0;
    move.l    d6,(a0)+                      ;(*pCurY++)=d6;
    addq.l    #1,d1                         ;//Incrementing i
    bra       .FORi4                        ;//Jumping to .FORi4
.ENDFORi4:                                  ;}//end of outer loop #4
;---====== Begin of History Buffer Loading ======---
    move.l    72(a7),a6                     ;pCurX=pX+n-N+1;
    sub.l     d0,d7
    lea       (4,a6,d7.l*4),a1
    move.l    76(a7),a6                     ;pCurY=pY+n-N+1;
    lea       (4,a6,d7.l*4),a0
    move.l    8(a2),a4                      ;pCurHistory=pIIR->pIirHistory;

    moveq     #1,d1                         ;for(i=1;i<N;i++) { //begin of outer loop #5
.FORbuf:
    cmp.l     d0,d1                         ;//Comparing i with N
    bcc       .ENDbuf                       ;//If (i>=N) then jump to .ENDbuf
    move.l    a0,a6                         ;*pCurHistory++=*pCurY++;
    addq.l    #4,a0
    move.l    (a6),(a4)+
    move.l    (a1)+,(a4)+                   ;*pCurHistory++=*pCurX++;
    addq.l    #1,d1                         ;//Incrementing i
    bra       .FORbuf                       ;//Jumping to .FORbuf
.ENDbuf:                                    ;}//end of outer loop #5
    move.l    4(a2),d7                      ;pIIR->iIirHistoryCount=pIIR->iIirCoefCount-1;
    subq.l    #1,d7
    move.l    d7,12(a2)
; ---====== End of History Buffer Loading ======--

;-=Restoring MACSR and the values of used registers=-
    move.l    60(a7),d0                     ;reload saved MACSR from its slot inside the frame
    move.l    d0,MACSR
    movem.l   (a7),d0-d7/a0-a6              ;d0 (used as scratch just above) is restored here
    lea       64(a7),a7                     ;release the 64-byte frame
    rts

